/*
Created by : Clemence, August 2018

This Dofile : 
			- Focuses on Export data, generate datasets to compare the different sources and make descriptive statistics 

Issues : Different Data sources that are supposed to be equivalent

	1 - Classic customs flow 2 and 4 :  1995-2010
	2 - flow 2  with currency : 2011 - 2017
	3 - Flow with buyer id : 1995 - 2017

Test to be done :

	- Check the consistency of flow 4 between the classic custom flow source and the flows with buyer ID
	- Check the evolution of value traded/nb of siren between 2010 and 2011 (new source)
	
What is done :

		1) Keep only valid siren
		2) Keep only valid country code
		3) Keep only valif flows (no flow should be export=0 & kgs=0)
		4) Only valid nc8 code
		5) Only valid buyer number
Some remaining problem :
		- For flows 1 & 2 after 2011 : there observations with valstat==0 & kgs==0, i don't understand why, no special patterns in terms of year or iso2
		- Similar obseravtion euros==0 & kgs==0 in buyers data
Output :

		- exp_by_year, exhaustive exports flows 2 and 4 from 1995 to 2017, data are aggregated at the siren*iso2*year level. 
		  Data are exhaustive because they include under threshold observations withut products
		- exp_by_nc8year, exports flows 2 and 4 from 1995 to 2017 siren*iso2*year*nc8 level.
		- Excel file: SummaryStatistics_24
		  
To be deleted later 
		
		- Clean Data from classic mensuel : cm_by_year.dta,  cm_by_nc8year
		- Clean data from currency data : cur_by_year and cur_by_nc8year
		- Firm to firm data : buyer_by_year, buyer_by_nc8year and buyer_by_buyernc8year

*/

set more off, perma
cd $sourcepath	
do "$dopath\id_group.do"
	
*1 Source : Classic customs flow	
use flux24_cm.dta, clear
/* 
  count if nc8==.
  0
. count if pyod==""
  0
. count if euros==0 & kgs==0
  0 */

* Variables an   mois     flux  siren   nc8    cpa6    pyod   payp  euros  indicmasse  kgs unites   year
drop payp // no obs only for imports
rename pyod iso2
rename euros export

*1.1 Keep Valid siren numbers
tostring siren, replace
replace siren="0"+siren if length(siren)==8 
replace siren="00"+siren if length(siren)==7
#delimit ;
count if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999"| length(siren)<=5;
egen drop=sum(export) if siren=="000000000"|siren=="111111111"|siren=="222222222"|siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999"| length(siren)<=5;
#delimit cr


#delimit ; 
*putexcel A2=("Total (value)")  A3=("Missing Siren (value)") A4=("Missing Destination (Value)") A8=("Coverage") 
* B1=("Classic Customs Data : 95-2010") 
* B2=(`tot') B3=(`drop_') ; 
 drop if length(siren)<=5 ;
drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999";
#delimit cr


*1.2 Keep valid pyod

/*
*Codes QQ,QU,QV,QW are anonymized for defense reason ==> dropped
*Codes TP XA XC XE XF XG XH XI XK XL XM XO XP XR XS  are user-defined codes ==> dropped"
*https://www.iso.org/fr/glossary-for-iso-3166.html
Codes attribués par les utilisateurs - Si des utilisateurs ont besoin de codes pour représenter des noms de pays qui ne font pas partie 
de la présente partie de l'ISO 3166, les séries de lettres AA, QM à QZ, XA à XZ et ZZ et les séries AAA à AAZ, QMA à QZZ, XAA à XZZ et ZZA 
à ZZZ et la série des nombres 900 à 999 peuvent être employées.
* XU = Belgiuem and Luxembourg until 99
* YU Yougoslavia until 2003
* CS : Serbia Montenegra 2004-2005
* TP : Portuguese Timor in 2001-2002
* AN : Netherland Antilles from 95- 2010
*/
drop drop
gen drop=1 if  inlist(iso2,"QQ","QU","QV","QW")
replace drop=1 if inlist(iso2,"XA","XC","XE","XF","XG")
replace drop=1 if inlist(iso2,"XH","XI","XK","XL","XM")
replace drop=1 if inlist(iso2,"XO","XP","XR","XS","XV","XZ")

drop if drop==1
compress
save clean_flux_24.dta, replace

*Aggregate at the siren*iso2*year level
collapse (sum) export kgs unites, by(siren iso2 year flux)
sort siren iso2 year
gen source="Classic Mensuel"
compress
save cm_by_year.dta, replace


*1.3 Clean Products
use clean_flux_24, clear
tostring nc8, replace
replace nc8="0"+nc8 if length(nc8)==7
replace nc8="00"+nc8 if length(nc8)==6
replace nc8="000"+nc8 if length(nc8)==5
replace nc8="0000"+nc8 if length(nc8)==4
replace nc8="00000"+nc8 if length(nc8)==3
replace nc8="000000"+nc8 if length(nc8)==2
replace nc8="0000000"+nc8 if length(nc8)==1
g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" )
g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000")

drop if nc8part == 1
drop if nc8group==1
drop nc8group nc8part 
* drop drop2
compress
keep export nc8 iso2 siren kgs unites mois year flux
gen source="cm"
save clean_flux_24.dta, replace

*Aggregate at the siren*iso2*year*nc8 level
collapse (sum) export kgs unites, by(siren iso2 year nc8 flux)
sort siren iso2 year
gen source="Classic Mensuel"
save cm_by_nc8year.dta, replace



*2 Source : Currency customs flows 2
cd $datapath
use x_cur.dta, clear

keep if flux==2
rename pyod iso2
rename euros export

*2.1 Keep Valid siren numbers
tostring siren, replace
replace siren="0"+siren if length(siren)==8 
replace siren="00"+siren if length(siren)==7

#delimit ;
egen drop=sum(export) if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999"| length(siren)<=5;
#delimit cr



#delimit ; 
 drop if length(siren)<=5 ;
drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999";
#delimit cr

*2.2 Keep valid pyod
/*
Many EU code as destination country :
	- can stand for US "Etats Unis"
	- codes used when people don't know what to put
	- code used for re-exported goods from switzerland
FR code as the origin of Imports can happen when
	a) French goods are imported from another cpountry and the re-exported from this country to France. 
       Ex: A Belgian  wine delaer imports champagn and then sale part of it in France 

	b) Defective exported  goods taht are re-imported, temporary exports
	c) Misreporting
Decison : Drop thiese flows
FR code as the destination country
	Could be related to exports from DOM-TOM 
	==> dropped
*/
drop drop
gen drop=1 if  inlist(iso2,"QP","QU","QV","QW","FR")
replace drop=1 if inlist(iso2,"XC","XK","XL","XS")
replace drop=1 if iso2=="EU"



drop if drop==1

compress
save clean_x_cur.dta, replace


*Aggregate at the siren*iso2*year level
collapse (sum) export kgs unites, by(siren iso2 year flux)
gen source="Currency"
sort siren iso2 year
append using cm_by_year
save exp_by_year.dta, replace
erase cm_by_year.dta



*2.3 Clean Product
use clean_x_cur, clear
tostring nc8, replace
replace nc8="0"+nc8 if length(nc8)==7
replace nc8="00"+nc8 if length(nc8)==6
replace nc8="000"+nc8 if length(nc8)==5
replace nc8="0000"+nc8 if length(nc8)==4
replace nc8="00000"+nc8 if length(nc8)==3
replace nc8="000000"+nc8 if length(nc8)==2
replace nc8="0000000"+nc8 if length(nc8)==1
g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" )
g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000")
drop if nc8part == 1
drop if nc8group==1
drop nc8group nc8part 
save clean_x_cur, replace

*Aggregate at the siren*iso2*year*nc8 level
collapse (sum) export kgs unites, by(siren iso2 year nc8 flux)
gen source="Currency"
sort siren iso2 year
append using cm_by_nc8year
save exp_by_nc8year.dta, replace
erase cm_by_nc8year.dta

use clean_x_cur, clear
gen source="currency"
keep export nc8 iso2 siren kgs unites mois year flux source
append using clean_flux_24.dta
save clean_flux_24.dta, replace
erase clean_x_cur.dta


*3 Source : Buyer custom flows
use "$rawbuyer\x_all_new_ss.dta", clear

cap drop year
rename an year
rename pyod iso2
rename euros export
gen flux2=4 if flux=="E"
drop flux
rename flux2 flux

*3.1 Keep Valid siren numbers
tostring siren, replace
replace siren="0"+siren if length(siren)==8 
replace siren="00"+siren if length(siren)==7



#delimit ; 
 drop if length(siren)<=5 ;
drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999";
#delimit cr

*3.2 Keep valid pyod
* Before 1999: XU for BEL and LUX - but info on the destination country with iso2_tva in 98% of the cases 
* below a threshold, iso2 is not reported ("00") but iso2_tva can be
* in VAT number, Greece is refered to as EL
gen iso2_tva=substr(numtvaanonym,1,2)
replace iso2=iso2_tva if iso2=="XU" & (iso2_tva=="BE"|iso2_tva=="LU")
replace iso2="BE" if iso2=="XU"
replace iso2_tva="GR" if iso2_tva=="EL"

*For observations under_threshold : No pyod ==> attribute iso_tva
gen missing_iso2=missing(iso2)
replace iso2=iso2_tva if missing_iso2==1
*By doing that some invalid iso2 appears ==> dropped
drop if inlist(iso2,"FR","GL","PA","XX") // (435 observations deleted)

*3.3 Keep relevant regim
*Regime 22 : "Transfert d'un bien taxable dans l'Etat d'arrivée" has been absorbed by regim 21 after 2001
*Regime 23 "Facturation d'un travail à façon" and regime 24 "Envoi de matériaux en vue de travail à façon" ==> has been absorbed by regime 29 after 1997
*Drop flux 25 and 26 as they do not refer to a particular transaction (outside of trade statistics)
replace regdem=21 if regdem==22 
replace regdem=29 if regdem==23 |regdem==24
drop if regdem==25 | regdem==26 // not included usually, no buyer, no nc8


*3.4 : remove flows with export=0 and kgs=0
drop if export==0 & kgs==0


preserve 
*Append under-threshold and after 2010 obs to the export data base
keep if ss==1 | year>2010
collapse (sum) export kgs unites (mean) ss, by(siren iso2 year flux)
gen source="f2f"
sort siren iso2 year
compress
append using exp_by_year.dta
save exp_by_year.dta, replace
restore


*3.5 Clean CN8 products
* We want to keep only valid NC8 codes
* NC8 99500000 is a code used for small transactions and "envois groupés" 
* NC8 9880XX00 - XX being 73, 84, 99 or 85 - is a code used for large manufacturing or commercial firms which import values above 3 million euros and benefit from a simplified code grouping all products together
* L'arrêté du 26 décembre 2000 (modifié) fixe les conditions d'utilisation de la nomenclature spécifique des produits 
* 99050000 : personnal belongings of moving individuals
* 99190000 : belongings of catastrophy victims (coffin...) 
* 9930 : Refuelling
* 9931 : goods for offsghores installations/infrastructures
* Seek for more informations in the note. 
* In flux 13, non-valid cn8 account for about 0.5 percent of transactions, 97.31 of which are "99500000". 97.49% of non-valid cn8 are in flux 3 : no 99500000
tostring nc8, replace
replace nc8="0"+nc8 if length(nc8)==7
replace nc8="00"+nc8 if length(nc8)==6
replace nc8="000"+nc8 if length(nc8)==5
replace nc8="0000"+nc8 if length(nc8)==4
replace nc8="00000"+nc8 if length(nc8)==3
replace nc8="000000"+nc8 if length(nc8)==2
replace nc8="0000000"+nc8 if length(nc8)==1
g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" )
g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000")

drop if nc8part == 1
drop if nc8group==1
drop nc8group nc8part 

preserve 
*Aggregate at the siren*iso2*nc8*year level
collapse (sum) export kgs unites , by(siren iso2 year flux nc8)
gen source="f2f"
sort siren iso2 year
keep if year>2010
compress
append using exp_by_nc8year.dta
save exp_by_nc8year.dta, replace
restore


* 3.6 : Clean buyer number
encode(control), gen(encode2)
label define control 1 "Not found buyer's VAT number"  2 "Buyer's VAT number recoded but Not found"  3 "Recoded and verified buyer VAT number" 4 "Verified initial buyer VAT number"
label values encode2 control
gen drop=1 if encode2==1 | encode2==2 
gen drop2=1 if numtvaanonym==""

id_group buyer "num iso2"

preserve 
*Aggregate at the siren*iso2*nc8*year level
collapse (sum) export kgs unites , by(siren iso2 year flux buyer num nc8)
sort siren iso2 year
compress
save buyer_by_buyernc8year.dta, replace
restore

erase clean_flux_24.dta
 
 use exp_by_year.dta, clear
 collapse (sum) export kgs unites, by(siren iso2 year)
sort siren iso2 year
compress
save exp_by_sireniso2year_19932017.dta, replace

use exp_by_nc8year.dta, clear
 collapse (sum) export kgs unites, by(siren iso2 year nc8)
sort siren iso2 year
compress
save exp_by_sirennc8iso2year_19932017.dta, replace

erase exp_by_year.dta
erase exp_by_nc8year.dta
erase temp.dta

